Opening this file on a mobile phone may result in poor graph visualization.
Please make sure you are connected to the internet — some of the graphs are embedded from our Tableau Cloud account.
# Standard library
import itertools
import math

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.offline as pyo
import seaborn as sns
from scipy.spatial.distance import cdist
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.decomposition import PCA as sklearnPCA
# BUG FIX: AdaBoostClassifier was imported twice on the same line.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier

# Set notebook mode to work in offline
pyo.init_notebook_mode()
def missing_value_of_data(data):
    """Summarize missing values per column.

    Returns a DataFrame with a 'Total' column (NaN count) and a
    'Percentage %' column (share of rows, rounded to 2 decimals),
    sorted by the number of missing values, descending.
    """
    null_counts = data.isnull().sum().sort_values(ascending=False)
    null_share = round(null_counts / data.shape[0] * 100, 2)
    return pd.concat([null_counts, null_share], axis=1, keys=['Total', 'Percentage %'])
def replace_names_with_titles(df):
    """Replace each full passenger name by its honorific and rename the column.

    The honorific is the substring between the first comma and the following
    period (e.g. 'Braund, Mr. Owen' -> ' Mr'); the 'Name' column is then
    renamed to 'Title' in place.
    """
    def _title_of(full_name):
        # Substring between the first comma and the next period.
        start = full_name.find(",") + 1
        end = full_name.find(".", start)
        return full_name[start:end]

    df.Name = [_title_of(n) for n in df.Name]
    df.rename(columns = {'Name':'Title'}, inplace = True)
def filling_age_by_titles_avg(df):
    """Fill missing Age values with the mean age of passengers sharing the title.

    If every age for a given title is NaN (no per-title average exists),
    the overall Age mean of the frame is used instead.
    """
    for title in set(df.Title):
        same_title = df.Title == title
        avg_age = df.loc[same_title, 'Age'].mean()
        # No average available for this title -> fall back to the global mean.
        if math.isnan(avg_age):
            avg_age = df.Age.mean()
        df.loc[same_title, 'Age'] = df.loc[same_title, 'Age'].fillna(value=avg_age)
def optimize_by_droping_features(train_x,train_y,test_x,test_y,min_features,score,model,param_grid):
    """Greedy backward feature elimination around a tuned model.

    Repeatedly drops the least important feature (per the current model's
    feature_importances_), re-tunes with GridSearchCV, and keeps the drop
    only if test accuracy does not decrease.  Stops at the first
    non-improving drop, or when min_features columns remain.

    Returns (best_model, excluded_features): the best estimator found and
    the list of dropped column names.
    """
    curr_train_x = train_x
    curr_test_x = test_x
    best_score = score
    best_model = model
    excluded_features = []
    isBetter = True
    while isBetter :
        if(len(curr_train_x.columns) > min_features):
            features = list(curr_train_x.columns)
            importance_df = pd.DataFrame({'feature': features,
                                          'importance': best_model.feature_importances_}).\
                            sort_values('importance', ascending = False)
            display(importance_df)  # NOTE(review): display() is an IPython/notebook builtin
            print()
            least_important_feature = importance_df['feature'].iloc[-1]
            print('Trying To Drop Feature : '+ least_important_feature)
            # drop(columns=...) replaces the positional-axis form removed in pandas 2.0
            curr_train_x = train_x.drop(columns=least_important_feature)
            curr_test_x = test_x.drop(columns=least_important_feature)
            print('Optimizing The Best Model Without The Feature , Please Wait ⏳ ... ')
            # NOTE(review): the iid= parameter was removed in scikit-learn 0.24 —
            # drop it (here and in the rest of the file) when upgrading sklearn.
            gs = GridSearchCV(best_model,param_grid = param_grid, cv = 5, n_jobs = -1,iid=False)
            gs.fit(curr_train_x,train_y)
            current_model = gs.best_estimator_
            pred_y = current_model.predict(curr_test_x)
            current_score = accuracy_score(test_y, pred_y)
            print("Accuracy After Droping Feature : ", current_score)
            if(current_score >= best_score) :
                excluded_features.append(least_important_feature)
                print('Droping Feature Was Efficient ✔️')
                print('excluded_features',excluded_features)
                print()
                # Commit the drop: continue eliminating from the reduced frames.
                train_x = curr_train_x
                test_x = curr_test_x
                best_score = current_score
                best_model = current_model
            else:
                print("Droping Feature Was Not Efficient ❌")
                print()
                print("Optimization By Droping Features Is Done")
                print('excluded_features',excluded_features)
                print()
                isBetter = False
        else :
            print("Minimum Number Of Features Is Reached ⚠️")
            print("Optimization By Droping Features Is Done")
            print('excluded_features',excluded_features)
            print()
            isBetter = False
    print('The Best Score For Model is : ',best_score)
    return best_model , excluded_features
def final_train_and_predict(model,train_x,train_y,validation_x,validation_y,test,excluded):
    """Refit *model* on train + validation (minus *excluded* columns) and
    predict on *test* with the same columns removed.

    validation_y is merged into the training labels; it is not used for
    scoring here.  Returns the prediction array.
    """
    train_x = pd.concat([train_x,validation_x]) ## train + validation
    # drop(columns=...) replaces the positional-axis form removed in pandas 2.0
    train_x = train_x.drop(columns=excluded) ## removing features that were excluded , if there is any
    train_y = pd.concat([train_y,validation_y])
    model.fit(train_x,train_y) ## train the model with train + validation
    pred_y = model.predict(test.drop(columns=excluded))
    return pred_y
def export_to_csv (predictions, passengers_id, file_name):
    """Write a Kaggle submission file '<file_name>.csv' with the columns
    PassengerId and Survived (no index column)."""
    submission = pd.DataFrame(
        {'PassengerId': list(passengers_id), 'Survived': list(predictions)},
        columns=['PassengerId', 'Survived'],
    )
    submission.to_csv(file_name + ".csv", index=False)
def view_correlation(df):
    """Draw an annotated heatmap of the feature correlation matrix."""
    corr_matrix = df.corr()
    plt.figure(figsize=(9, 9))
    sns.heatmap(corr_matrix, linewidths=0.01, square=True, annot=True,
                annot_kws={"size": 12}, cmap='YlGnBu', linecolor="white")
    sns.set(font_scale=0.8)
    plt.title('Correlation between features parameters matrix');
    # Widen the y-limits by half a cell on each side (works around the
    # matplotlib 3.1.1 bug that crops the first/last heatmap rows).
    bottom, top = plt.ylim()
    plt.ylim(bottom + 0.5, top - 0.5)
    plt.show()
# BUG FIX: a stray 'normalize=True.' fragment (markdown-cell residue) was glued
# in front of the def below, making the file a syntax error; it has been removed.
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Oranges):
    """Plot a confusion matrix, optionally row-normalized.

    cm      : square confusion-matrix array (e.g. from sklearn's confusion_matrix)
    classes : axis tick labels, one per class
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.figure(figsize = (10, 10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size = 24)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size = 14)
    plt.yticks(tick_marks, classes, size = 14)
    # Integer cells for raw counts, two decimals when normalized.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    # Labeling the plot
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), fontsize = 20,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    # Widen the y-limits by half a cell (matplotlib 3.1.1 heatmap-crop workaround).
    b, t = plt.ylim()
    b += 0.5
    t -= 0.5
    plt.ylim(b, t)
    plt.grid(None)
    plt.tight_layout()
    plt.ylabel('True label', size = 18)
    plt.xlabel('Predicted label', size = 18)
def elbow_met(df):
    """Plot the K-Means elbow curve (mean distortion for k = 1..9).

    Distortion is the mean distance from each row to its nearest cluster
    center, used to eyeball a good k for K-Means.
    """
    distortions = []
    K = range(1,10)
    for k in K:
        # BUG FIX: the model was fit twice per k (fit in the constructor chain
        # and again via kmeanModel.fit(df)); one fit is enough.
        kmeanModel = KMeans(n_clusters=k).fit(df)
        distortions.append(sum(np.min(cdist(df, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / df.shape[0])
    # Plot the elbow
    plt.figure(figsize=(10,7))
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Within groups sum of squares')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()
# Load the Kaggle Titanic train/test sets.
# NOTE(review): absolute Windows paths — this only runs on the author's machine;
# consider relative paths or a configurable data directory.
train = pd.read_csv(r'C:\Users\L.A\Desktop\titanic\train.csv' , encoding = "ISO-8859-8")
test = pd.read_csv(r'C:\Users\L.A\Desktop\titanic\test.csv' , encoding = "ISO-8859-8")
passengers_id = test.PassengerId # to be used later when real life testing
%%HTML
<br>
<div class='tableauPlaceholder' id='viz1592437685721' style='position: relative'><noscript><a href='#'><img alt=' ' src='https://public.tableau.com/static/images/ML/ML_embarked/Sheet1/1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='ML_embarked/Sheet1' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https://public.tableau.com/static/images/ML/ML_embarked/Sheet1/1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /><param name='filter' value='publish=yes' /></object></div> <script type='text/javascript'> var divElement = document.getElementById('viz1592437685721'); var vizElement = divElement.getElementsByTagName('object')[0]; vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.55)+'px'; var scriptElement = document.createElement('script'); scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js'; vizElement.parentNode.insertBefore(scriptElement, vizElement); </script>
<div class='tableauPlaceholder' id='viz1592437607255' style='position: relative'><noscript><a href='#'><img alt=' ' src='https://public.tableau.com/static/images/ML/ML_embarked/Sheet3/1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='ML_embarked/Sheet3' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https://public.tableau.com/static/images/ML/ML_embarked/Sheet3/1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /><param name='filter' value='publish=yes' /></object></div> <script type='text/javascript'> var divElement = document.getElementById('viz1592437607255'); var vizElement = divElement.getElementsByTagName('object')[0]; vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.45)+'px'; var scriptElement = document.createElement('script'); scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js'; vizElement.parentNode.insertBefore(scriptElement, vizElement); </script>
<br>
# 3-D scatter of Age/Fare/Parch colored by survival.
fig = px.scatter_3d(train, x='Age', y='Fare', z='Parch',
                    color='Survived')
fig.show()
# Missing-value summary (the returned frame is displayed in the notebook).
missing_value_of_data(train)
# Keep only rows with a known port of embarkation.
train = train[(train.Embarked.notnull())]
# Drop sparse / identifier columns from both sets.
# (drop(columns=...) replaces the positional-axis form removed in pandas 2.0)
#Train
train.drop(columns='Cabin',inplace=True)
#Test
test.drop(columns='Cabin',inplace=True)
#Train
train.drop(columns='Ticket',inplace=True)
train.drop(columns='PassengerId',inplace=True)
#Test
test.drop(columns='Ticket',inplace=True)
test.drop(columns='PassengerId',inplace=True)
# Replace full names by their honorific (the 'Name' column becomes 'Title').
replace_names_with_titles(train)
replace_names_with_titles(test)
%%html
<br>
<div class='tableauPlaceholder' id='viz1592587025547' style='position: relative'><noscript><a href='#'><img alt=' ' src='https://public.tableau.com/static/images/Bo/Book2_15925864916510/Sheet1/1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='Book2_15925864916510/Sheet1' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https://public.tableau.com/static/images/Bo/Book2_15925864916510/Sheet1/1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /><param name='filter' value='publish=yes' /></object></div> <script type='text/javascript'> var divElement = document.getElementById('viz1592587025547'); var vizElement = divElement.getElementsByTagName('object')[0]; vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.55)+'px'; var scriptElement = document.createElement('script'); scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js'; vizElement.parentNode.insertBefore(scriptElement, vizElement); </script>
<br>
# Impute missing ages with the per-title average (global mean as fallback).
filling_age_by_titles_avg(train)
filling_age_by_titles_avg(test)
f = plt.figure(figsize=(20,6))
sns.scatterplot(x='Age',y='Fare',hue='Survived',data=train)
f = plt.figure(figsize=(20,6))
f.add_subplot(1,2,1)
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (use histplot/displot).
sns.distplot(train['Age'])
f.add_subplot(1,2,2)
sns.boxplot(train['Age'])
# Re-check remaining missing values in the test set (displayed in the notebook).
missing_value_of_data(test)
# The single missing test Fare is filled with the column mean.
test.Fare=test.Fare.fillna(test.Fare.mean())
le = preprocessing.LabelEncoder()
# NOTE(review): fit_transform is re-fit per column AND separately on train and
# test, so the integer codes are not guaranteed to match across the two sets
# (a category present in only one set shifts the mapping) — verify.
#Train
train.Embarked = le.fit_transform(train.Embarked)
train.Sex = le.fit_transform(train.Sex)
train.Title = le.fit_transform(train.Title)
#Test
test.Embarked = le.fit_transform(test.Embarked)
test.Sex = le.fit_transform(test.Sex)
test.Title = le.fit_transform(test.Title)
# Keep an encoded copy of the test set for the final "real life" predictions.
real_life_test = test.copy()
# 2-D PCA projection of the (unscaled) training frame, for visualization only.
d2 = pd.DataFrame(sklearnPCA(n_components=2).fit_transform(train))
plt.figure(figsize=(30,10))
plt.scatter(d2.iloc[:, 0], d2.iloc[:, 1], s=50 );
plt.title('2d data visualization')
# NOTE(review): drop('Survived',1) uses the positional axis argument removed in pandas 2.0.
view_correlation(train.drop('Survived',1))
train.describe()
# Shared random seed for the model experiments below.
RSEED = 10
# Elbow curve over the numeric columns to pick k for K-Means below.
elbow_met(train[['Age','Fare','SibSp','Parch']])
# Add a K-Means cluster id as an extra feature (k chosen from the elbow plot).
train_with_cluster = train.copy()
real_life_test_with_cluster = real_life_test.copy()
k = 3
cluster_features = ['Age','Fare','SibSp','Parch']
kmeans = KMeans(n_clusters = k).fit(train[cluster_features])
train_with_cluster['cluster']= kmeans.labels_ # add the clusters
# BUG FIX: previously a *second* KMeans was fit on the full real_life_test frame,
# so its labels came from different feature columns and from an unrelated model,
# making the cluster ids incomparable with the training clusters.  Predict with
# the train-fitted model on the same four columns instead.
real_life_test_with_cluster['cluster']= kmeans.predict(real_life_test[cluster_features])
feature_cols = ['Pclass','Title','Sex', 'Age','SibSp','Parch','Fare','Embarked']
x = train[feature_cols]
y = train.Survived
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=1)
# with the cluster as a feature (same random_state => identical row split,
# so train_y/test_y can be reused for the clustered frames)
feature_cols = ['Pclass','Title','Sex', 'Age','SibSp','Parch','Fare','Embarked','cluster']
x = train_with_cluster[feature_cols]
y = train_with_cluster.Survived
train_x_with_cluster, test_x_with_cluster, _, _ = train_test_split(x, y, test_size=0.3, random_state=1)
# The base model
# Creating a Decision Tree Classifier
tree = DecisionTreeClassifier(random_state=RSEED)
tree.fit(train_x, train_y)
pred_y = tree.predict(test_x)
score = accuracy_score(pred_y,test_y)
print('Accuracy : ',score)

# The tuned model
# Hyperparameter grid
param_grid = {
    'criterion':['gini', 'entropy'], # Whether the criterion of the tree is gini or entropy
    'max_depth': [3,5,8,15,20,30,40, 50, 60] , # The maximum depth of the tree.
    'max_features': ['sqrt', None,1,2,3,4,5,6,7], # The number of features to consider when looking for the best split
    'min_samples_split': [2, 4,6,8,10,12,15], # The minimum number of samples required to split an internal node
}
# Create a grid search object
gs = GridSearchCV(tree, param_grid, cv=5, scoring='accuracy',iid=False)
# Fit the grid search
gs.fit(train_x, train_y)
pred_y = gs.predict(test_x)
score = accuracy_score(pred_y,test_y)
print('Accuracy : ',score)

# Dropping features
model=gs.best_estimator_
best_model,excluded_features = optimize_by_droping_features(train_x,train_y,test_x,test_y,3,score,model,param_grid)

# Final train and export to .csv
# BUG FIX: this section passed the pre-elimination `model` instead of the
# `best_model` returned above (the other model sections use best_model).
pred_y = final_train_and_predict(best_model,train_x,train_y,test_x,test_y,real_life_test,excluded_features)
export_to_csv(pred_y,passengers_id,'DecisionTreeWithoutCluster')
# The base model (same Decision Tree experiment, now with the cluster feature)
# Creating a Decision Tree Classifier
tree = DecisionTreeClassifier(random_state=RSEED)
tree.fit(train_x_with_cluster, train_y)
pred_y = tree.predict(test_x_with_cluster)
score = accuracy_score(pred_y,test_y)
print('Accuracy : ',score)

# The tuned model (reuses the decision-tree param_grid defined above)
# Create a grid search object
gs = GridSearchCV(tree, param_grid, cv=5, scoring='accuracy',iid=False)
# Fit the grid search
gs.fit(train_x_with_cluster, train_y)
pred_y = gs.predict(test_x_with_cluster)
score = accuracy_score(pred_y,test_y)
print('Accuracy : ',score)

# Dropping features
model=gs.best_estimator_
best_model,excluded_features = optimize_by_droping_features(train_x_with_cluster,train_y,test_x_with_cluster,test_y,3,score,model,param_grid)

# Final train and export to .csv
# BUG FIX: this section passed the pre-elimination `model` instead of the
# `best_model` returned above (the other model sections use best_model).
pred_y = final_train_and_predict(best_model,train_x_with_cluster,train_y,test_x_with_cluster,test_y,real_life_test_with_cluster,excluded_features)
export_to_csv(pred_y,passengers_id,'DecisionTreeWithCluster')
# The base model
rfc = RandomForestClassifier(n_estimators = 300 ,random_state = RSEED ,min_samples_split = 5)
rfc = rfc.fit(train_x,train_y)
pred_y = rfc.predict(test_x)
score = accuracy_score(pred_y,test_y)
print('Accuracy : ',score)

# The tuned model
# Hyperparameter grid
param_grid = {
    'n_estimators': [10,100, 200, 250] ,#The number of trees in the forest.
    'max_depth': [None, 50, 60, 70] ,#The maximum depth of the tree.
    'max_features': ['sqrt', None],#The number of features to consider when looking for the best split
    'min_samples_split': [2, 10],#The minimum number of samples required to split an internal node
    'bootstrap': [True, False]#Whether bootstrap samples are used when building trees.
}
rs = GridSearchCV(rfc, param_grid, n_jobs = -1,scoring = 'accuracy', cv = 5 ,iid=False)
rs.fit(train_x,train_y)
pred_y=rs.predict(test_x)
score = accuracy_score(pred_y,test_y)
print('Accuracy : ',score)

# Dropping features
model=rs.best_estimator_
best_model,excluded_features = optimize_by_droping_features(train_x,train_y,test_x,test_y,3,score,model,param_grid)

# Final train and export to .csv
pred_y = final_train_and_predict(best_model,train_x,train_y,test_x,test_y,real_life_test,excluded_features)
export_to_csv(pred_y,passengers_id,'RandomForestWithoutCluster')
# The base model (Random Forest with the cluster feature)
rfc_with_cluster = RandomForestClassifier(n_estimators = 300 ,random_state = RSEED ,min_samples_split = 5)
rfc_with_cluster = rfc_with_cluster.fit(train_x_with_cluster,train_y)
pred_y = rfc_with_cluster.predict(test_x_with_cluster)
score = accuracy_score(pred_y,test_y)
print('Accuracy : ',score)

# The tuned model (reuses the random-forest param_grid defined above)
rs = GridSearchCV(RandomForestClassifier(random_state = RSEED), param_grid, n_jobs = -1,scoring = 'accuracy', cv = 5 ,iid=False)
rs.fit(train_x_with_cluster,train_y)
pred_y=rs.predict(test_x_with_cluster)
score = accuracy_score(pred_y,test_y)
print('Accuracy : ',score)

# Dropping features
model=rs.best_estimator_
best_model,excluded_features=optimize_by_droping_features(train_x_with_cluster,train_y,test_x_with_cluster,test_y,3,score,model,param_grid)

# Final train and export to .csv
pred_y = final_train_and_predict(best_model,train_x_with_cluster,train_y,test_x_with_cluster,test_y,real_life_test_with_cluster,excluded_features)
export_to_csv(pred_y,passengers_id,'RandomForestWithCluster')
# The base model
adb = AdaBoostClassifier()
adb.fit(train_x,train_y)
pred_y = adb.predict(test_x)
score = accuracy_score(pred_y,test_y)
print('Accuracy : ',score)

# The tuned model
adb_param_grid = {'n_estimators' : [10,20,50,100,200,300,400,500,1000],
                  'learning_rate': [0.001,0.01,0.03 ,0.05, 0.07,0.1,0.5, 1],
                  'algorithm' : ['SAMME.R'],
                  'random_state' : [RSEED] }
gs = GridSearchCV(adb,param_grid = adb_param_grid, cv = 2, n_jobs = -1,iid=False)
best_adb = gs.fit(train_x,train_y)
pred_y = best_adb.predict(test_x)
score = accuracy_score(pred_y,test_y)
print('Accuracy : ',score)

# Dropping features
model = gs.best_estimator_
best_model,excluded_features=optimize_by_droping_features(train_x,train_y,test_x,test_y,3,score,model,adb_param_grid)

# Final train and export to .csv
pred_y = final_train_and_predict(best_model,train_x,train_y,test_x,test_y,real_life_test,excluded_features)
export_to_csv(pred_y,passengers_id,'AdaboostWithoutCluster')
# The base model (AdaBoost with the cluster feature)
adb = AdaBoostClassifier()
adb.fit(train_x_with_cluster,train_y)
pred_y = adb.predict(test_x_with_cluster)
score = accuracy_score(pred_y,test_y)
print('Accuracy : ',score)

# The tuned model
adb_param_grid = {'n_estimators' : [10,20,50,100,200,300,400,500,1000],
                  'learning_rate': [0.001,0.01,0.03 ,0.05, 0.07,0.1,0.5, 1],
                  'algorithm' : ['SAMME.R'],
                  'random_state' : [RSEED] }
gs = GridSearchCV(adb,param_grid = adb_param_grid, cv = 2, n_jobs = -1,iid=False)
best_adb = gs.fit(train_x_with_cluster,train_y)
pred_y = best_adb.predict(test_x_with_cluster)
score = accuracy_score(pred_y,test_y)
print('Accuracy : ',score)

# Dropping features
model = gs.best_estimator_
best_model,excluded_features=optimize_by_droping_features(train_x_with_cluster,train_y,test_x_with_cluster,test_y,3,score,model,adb_param_grid)

# Final train and export to .csv
pred_y = final_train_and_predict(best_model,train_x_with_cluster,train_y,test_x_with_cluster,test_y,real_life_test_with_cluster,excluded_features)
export_to_csv(pred_y,passengers_id,'AdaboostWithCluster')
# The base model
knn = KNeighborsClassifier(n_neighbors=10, metric='euclidean')
knn.fit(train_x, train_y)
pred_y = knn.predict(test_x)
score = accuracy_score(pred_y,test_y)
print('Accuracy : ', score)
# (drop(columns=...) replaces the positional-axis form removed in pandas 2.0)
view_correlation(train.drop(columns='Survived'))
# Try removing features that are highly correlated with others.
knn = KNeighborsClassifier(n_neighbors=10, metric='euclidean')
knn.fit(train_x.drop(columns='Pclass'), train_y)
pred_y = knn.predict(test_x.drop(columns='Pclass'))
score = accuracy_score(pred_y,test_y)
print('Accuracy : ', score)
knn = KNeighborsClassifier(n_neighbors=10, metric='euclidean')
knn.fit(train_x.drop(columns='Fare'), train_y)
pred_y = knn.predict(test_x.drop(columns='Fare'))
score = accuracy_score(pred_y,test_y)
print('Accuracy : ', score)
# the base model with all features : 0.707
# the base model without Pclass : 0.704 (-0.003)

# The Tuned model
#List Hyperparameters that we want to tune.
leaf_size = list(range(1,50))
n_neighbors = list(range(1,50))
p = [1,2,3,4]
#Convert to dictionary
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)
#Use GridSearch
knn = KNeighborsClassifier()
clf = GridSearchCV(knn, hyperparameters, cv = 10, iid=False)
#Fit the model
best_model = clf.fit(train_x.drop(columns='Fare'), train_y)
#Print The value of best Hyperparameters
print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])
pred_y = best_model.predict(test_x.drop(columns='Fare'))
score = accuracy_score(pred_y, test_y)
print('Accuracy : ', score)

# Final train and export to .csv
pred_y = final_train_and_predict(best_model,train_x,train_y,test_x,test_y,real_life_test,['Fare'])
export_to_csv(pred_y,passengers_id,'KnnWithoutCluster')
# The base model (KNN with the cluster feature; Fare dropped as in the plain run)
knn = KNeighborsClassifier(n_neighbors=10, metric='euclidean')
# (drop(columns=...) replaces the positional-axis form removed in pandas 2.0)
knn.fit(train_x_with_cluster.drop(columns='Fare'), train_y)
pred_y = knn.predict(test_x_with_cluster.drop(columns='Fare'))
score = accuracy_score(pred_y,test_y)
print('Accuracy : ', score)

# The Tuned model (reuses the `hyperparameters` grid defined in the plain KNN run)
#Use GridSearch
knn = KNeighborsClassifier()
clf = GridSearchCV(knn, hyperparameters, cv=10,iid=False)
#Fit the model
best_model = clf.fit(train_x_with_cluster.drop(columns='Fare'),train_y)
#Print The value of best Hyperparameters
print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])
score = accuracy_score(best_model.predict(test_x_with_cluster.drop(columns='Fare')),test_y)
print()
print('Accuracy : ', score)

# Final train and export to .csv
pred_y = final_train_and_predict(best_model,train_x_with_cluster,train_y,test_x_with_cluster,test_y,real_life_test_with_cluster,['Fare'])
export_to_csv(pred_y,passengers_id,'KnnWithCluster')
# The Base model
SVM = SVC(random_state=RSEED,gamma='auto')
SVM.fit(train_x,train_y)
pred_y = SVM.predict(test_x)
score = accuracy_score(pred_y,test_y)
print("Accuracy : ", score)
SVM  # notebook cell echo of the fitted estimator

# The Tuned model
param_grid = {'kernel': ['linear', 'rbf'],
              'degree': [2,3,6,7],
              'gamma' : [0.00001, 0.01, 0.1, 1],
              'C' : [1, 10, 1000]}
# BUG FIX: SVC(RSEED) passed the seed positionally into SVC's first
# parameter, C — i.e. it set C=10 and left random_state unset.
grid = GridSearchCV(SVC(random_state=RSEED),param_grid,cv=3,verbose=0,iid=False)
grid.fit(train_x,train_y)
grid.best_estimator_  # notebook cell echo of the winning estimator
pred_y = grid.predict(test_x)
score = accuracy_score(pred_y,test_y)
print("Accuracy : ", score)

# Final train and export to .csv
pred_y = final_train_and_predict(grid.best_estimator_,train_x,train_y,test_x,test_y,real_life_test,[])
export_to_csv(pred_y,passengers_id,'SvmWithoutCluster')
# The Base model (SVM with the cluster feature)
SVM = SVC(random_state=RSEED,gamma='auto')
SVM.fit(train_x_with_cluster,train_y)
pred_y = SVM.predict(test_x_with_cluster)
score = accuracy_score(pred_y,test_y)
print("Accuracy : ", score)

# The Tuned model (reuses the SVC param_grid defined in the plain SVM run)
# BUG FIX: SVC(RSEED) passed the seed positionally into SVC's first
# parameter, C — i.e. it set C=10 and left random_state unset.
grid = GridSearchCV(SVC(random_state=RSEED),param_grid,cv=3,verbose=0,iid=False)
grid.fit(train_x_with_cluster,train_y)
print(grid.best_estimator_)
pred_y = grid.predict(test_x_with_cluster)
score = accuracy_score(pred_y,test_y)
print("Accuracy : ", score)

# Final train and export to .csv
pred_y = final_train_and_predict(grid.best_estimator_,train_x_with_cluster,train_y,test_x_with_cluster,test_y,real_life_test_with_cluster,[])
export_to_csv(pred_y,passengers_id,'SvmWithCluster')
# The Base model
#defining ann classifier using sklearn's MLPClassifier method
ann = MLPClassifier(random_state=RSEED)
# fitting model.
ann.fit(train_x, train_y)
# making predictions
pred_y = ann.predict(test_x)
accuracy = accuracy_score(test_y, pred_y)
print("Accuracy: " + str(accuracy))

# The Tuned model
# using sklearn's method GridSearchCV
# BUG FIX: the grid contained (12), which is just the int 12 — a
# single-hidden-layer size must be the one-element tuple (12,).
param_grid = {'batch_size': [8,16,18,20],
              'solver': ['sgd','adam'],
              'hidden_layer_sizes': [(8,8,8),(8,8),(8,12),(12,),(12,12),(8,12,2)],
              'random_state' : [RSEED],
              'alpha' : [0.1,0.01,1],
              'max_iter':[500]}
ann_gs = GridSearchCV(ann,param_grid,cv=5, iid=False)
ann_gs.fit(train_x, train_y)
pred_y = ann_gs.predict(test_x)
accuracy = accuracy_score(test_y, pred_y)
print("Accuracy: " + str(accuracy))
print(ann_gs.best_params_)

# Final train and export to .csv
# NOTE(review): the file name starts with a space (' MlpWithoutCluster') — probably unintended.
pred_y = final_train_and_predict(ann_gs.best_estimator_,train_x,train_y,test_x,test_y,real_life_test,[])
export_to_csv(pred_y,passengers_id,' MlpWithoutCluster')
# The Tuned model (MLP with the cluster feature; reuses `ann` and `param_grid` above)
#optimization to Neural Nets model
ann_gs = GridSearchCV(ann, param_grid,cv=5, iid=False)
ann_gs.fit(train_x_with_cluster, train_y)
pred_y = ann_gs.predict(test_x_with_cluster)
accuracy = accuracy_score(test_y, pred_y)
print("Accuracy: " + str(accuracy))

# Final train and export to .csv
# NOTE(review): the file name starts with a space (' MlpWithCluster') — probably unintended.
pred_y = final_train_and_predict(ann_gs.best_estimator_,train_x_with_cluster,train_y,test_x_with_cluster,test_y,real_life_test_with_cluster,[])
export_to_csv(pred_y,passengers_id,' MlpWithCluster')
# PCA + tuned Random Forest on the transformed features.
pca = PCA()
rfc = RandomForestClassifier(random_state= RSEED*2, n_estimators=10)
rfc.fit(train_x,train_y)
param_grid = {
    'n_estimators': [10,100, 200, 250] , # The number of trees in the forest.
    'max_depth': [None, 50, 60, 70] , # The maximum depth of the tree.
    'max_features': ['sqrt', None], # The number of features to consider when looking for the best split
    'min_samples_split': [2, 10], # The minimum number of samples required to split an internal node
    'bootstrap': [True, False] , # Whether bootstrap samples are used when building trees.
    'random_state':[RSEED*2]
}
classifier = GridSearchCV(rfc, param_grid, n_jobs = -1,scoring = 'accuracy', cv = 5 ,iid=False)
X_transformed = pca.fit_transform(train_x)
classifier.fit(X_transformed, train_y)
newdata_transformed = pca.transform(test_x)
pred_labels = classifier.predict(newdata_transformed)
score = accuracy_score(test_y,pred_labels)
print("Accuracy : ", score)

# Final train and export to .csv
# NOTE(review): final_train_and_predict refits on the *raw* features — the PCA
# transform is not applied to the final training set or to real_life_test; verify intent.
pred_y = final_train_and_predict(classifier,train_x,train_y,test_x,test_y,real_life_test,[])
export_to_csv(pred_y,passengers_id,'PcaWithoutCluster')
# PCA + tuned Random Forest, with the cluster feature (reuses rfc/pca/param_grid above).
classifier = GridSearchCV(rfc, param_grid, n_jobs = -1,scoring = 'accuracy', cv = 5 ,iid=False)
X_transformed = pca.fit_transform(train_x_with_cluster)
classifier.fit(X_transformed, train_y)
newdata_transformed = pca.transform(test_x_with_cluster)
pred_labels = classifier.predict(newdata_transformed)
score = accuracy_score(test_y,pred_labels)
print("Accuracy : ", score)

# Final train and export to .csv
# NOTE(review): as in the plain PCA run, the final refit skips the PCA transform; verify intent.
pred_y = final_train_and_predict(classifier,train_x_with_cluster,train_y,test_x_with_cluster,test_y,real_life_test_with_cluster,[])
export_to_csv(pred_y,passengers_id,'PcaWithCluster')
# The Base model
xgb = XGBClassifier( random_state = RSEED)
xgb.fit(train_x,train_y)
pred_y = xgb.predict(test_x)
score = accuracy_score(test_y,pred_y)
print("Accuracy : ", score)

# The Tuned model
parameters = {
    'max_depth': range (2, 15, 1),
    'n_estimators': [10,50,100,250,300,350,400],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'min_child_weight': [1, 5, 10],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'learning_rate': [0.1, 0.5,0.01, 0.05,0.005,0.001]
}
xgb = XGBClassifier(random_state=RSEED)
clf = GridSearchCV(xgb, parameters, n_jobs=-1, cv=2, scoring='accuracy',verbose=2, refit=True)
clf.fit(train_x,train_y)
score = accuracy_score(test_y,clf.predict(test_x))
print("Accuracy : ", score)

# Final train and export to .csv
pred_y = final_train_and_predict(clf.best_estimator_,train_x,train_y,test_x,test_y,real_life_test,[])
export_to_csv(pred_y,passengers_id,'XGBoostWithoutCluster')
# The Base model (XGBoost with the cluster feature)
xgb = XGBClassifier(random_state = RSEED)
xgb.fit(train_x_with_cluster,train_y)
pred_y = xgb.predict(test_x_with_cluster)
score = accuracy_score(test_y,pred_y)
print("Accuracy : ", score)

# The Tuned model (reuses the `parameters` grid from the plain XGBoost run)
xgb = XGBClassifier(random_state=RSEED)
clf = GridSearchCV(xgb, parameters, n_jobs=-1, cv=2, scoring='accuracy',verbose=2, refit=True)
clf.fit(train_x_with_cluster,train_y)
pred_y = clf.predict(test_x_with_cluster)
score = accuracy_score(test_y,pred_y)
print("Accuracy : ", score)

# Final train and export to .csv
pred_y = final_train_and_predict(clf.best_estimator_,train_x_with_cluster,train_y,test_x_with_cluster,test_y,real_life_test_with_cluster,[])
export_to_csv(pred_y,passengers_id,'XGBoostWithCluster')
%%html
<div class='tableauPlaceholder' id='viz1593447555580' style='position: relative'><noscript><a href='#'><img alt=' ' src='https://public.tableau.com/static/images/Bo/Book1_15934475387770/Dashboard1/1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='Book1_15934475387770/Dashboard1' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https://public.tableau.com/static/images/Bo/Book1_15934475387770/Dashboard1/1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /><param name='filter' value='publish=yes' /></object></div> <script type='text/javascript'> var divElement = document.getElementById('viz1593447555580'); var vizElement = divElement.getElementsByTagName('object')[0]; if ( divElement.offsetWidth > 800 ) { vizElement.style.minWidth='420px';vizElement.style.maxWidth='1350px';vizElement.style.width='100%';vizElement.style.minHeight='587px';vizElement.style.maxHeight='887px';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';} else if ( divElement.offsetWidth > 500 ) { vizElement.style.minWidth='420px';vizElement.style.maxWidth='1350px';vizElement.style.width='100%';vizElement.style.minHeight='587px';vizElement.style.maxHeight='887px';vizElement.style.height=(divElement.offsetWidth*0.35)+'px';} else { vizElement.style.width='100%';vizElement.style.height='600px';} var scriptElement = document.createElement('script'); scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js'; vizElement.parentNode.insertBefore(scriptElement, vizElement); </script>
!jupyter nbconvert --to html Titanic.ipynb